# -------------------------------------------------------------------
# Purpose: Creates Table B3
# Author:  Max Posch, 25/07/2025
# Usage:   Source this script to generate the table.
# -------------------------------------------------------------------
# Fail fast if either the analysis-data directory or the appendix output
# directory is missing. Both path variables are defined outside this script.
stopifnot(
    dir.exists(pdataanalysis),
    dir.exists(poutputappendix)
)


# Load the county-level panel; this brings `countyLevel19001940` into scope.
load(file.path(pdataanalysis, "countyLevel19001940.RData"))

# Long-format entropy measure for the census years 1900, 1910, ..., 1940,
# sorted by year and, within year, by descending entropy.
d <- copy(countyLevel19001940)[
    year %in% seq(1900, 1940, 10),
    .(gisjoin_1900, year, entropy_namelast_mp_adjp)
][
    order(year, -entropy_namelast_mp_adjp)
]

# County counts for the first and last census year, multiplied by 1000.
# NOTE(review): the scaling suggests `sum_n_namelast_mp_adjp` is stored in
# thousands -- confirm against the data construction code.
pop1900 <- copy(countyLevel19001940)[
    year == 1900,
    .(gisjoin_1900, pop1900 = sum_n_namelast_mp_adjp * 1000)
]
pop1940 <- copy(countyLevel19001940)[
    year == 1940,
    .(gisjoin_1900, pop1940 = sum_n_namelast_mp_adjp * 1000)
]


## Counties with highest diversity 1940
# Lookup from gisjoin_1900 codes to the county and city labels printed in the
# table. The codes embed the state FIPS code ("G55..." is Wisconsin), so the
# Milwaukee entry is labelled WI.
county_mapping <- data.frame(
    gisjoin_1900 = c(
        "G1700310", "G3600470", "G3600610", "G5500790", "G3600810",
        "G2601630", "G3900350", "G3400170", "G3600290", "G3400310"
    ),
    county_name = c(
        "Cook County, IL", "Kings County, NY", "New York County, NY",
        "Milwaukee County, WI", "Queens County, NY", "Wayne County, MI",
        "Cuyahoga County, OH", "Hudson County, NJ", "Erie County, NY",
        "Passaic County, NJ"
    ),
    city_name = c(
        "Chicago", "Brooklyn", "Manhattan", "Milwaukee", "Queens",
        "Detroit", "Cleveland", "Jersey City", "Buffalo", "Paterson"
    )
)

# One row per selected county: entropy by census year (year_1900 ... year_1940)
# plus the 1900 and 1940 counts, sorted from most to least diverse in 1940.
# Join keys are spelled out so the joins cannot silently pick up another
# shared column and do not emit "Joining with `by = ...`" messages.
tab1 <- d %>%
    filter(gisjoin_1900 %in% county_mapping$gisjoin_1900) %>%
    pivot_wider(
        id_cols = gisjoin_1900,
        names_from = year,
        values_from = entropy_namelast_mp_adjp,
        names_prefix = "year_"
    ) %>%
    inner_join(pop1900, by = "gisjoin_1900") %>%
    inner_join(pop1940, by = "gisjoin_1900") %>%
    select(gisjoin_1900, pop1900, pop1940, everything()) %>%
    arrange(desc(year_1940))

# Swap the county code for human-readable labels and put them first.
tab1_with_county <- tab1 %>%
    left_join(county_mapping, by = "gisjoin_1900") %>%
    select(county_name, city_name, pop1900, pop1940, everything(), -gisjoin_1900)

# Render the counts as whole numbers with thousands separators (character).
tab1_with_county$pop1900 <- format(round(tab1_with_county$pop1900), big.mark = ",")
tab1_with_county$pop1940 <- format(round(tab1_with_county$pop1940), big.mark = ",")

# Write the LaTeX table, then hand the file to the project helper
# `get_clustering_coefs_rows()` (defined outside this script).
tab1_xtable <- xtable(tab1_with_county)
tablename <- file.path(poutputappendix, "tableB03high.tex")
print(tab1_xtable,
    include.rownames = FALSE,
    file = tablename
)
get_clustering_coefs_rows(tablename)



## Counties with lowest diversity 1940
# Lookup from gisjoin_1900 codes to the county labels printed in the table.
# These counties have no associated city label, so `city_name` is an empty
# string recycled across all rows; the column is kept so the layout matches
# the highest-diversity table.
county_mapping <- data.frame(
    gisjoin_1900 = c(
        "G4803010", "G0600030", "G0800530", "G5100790", "G4900550",
        "G5100360", "G4900250", "G4900330", "G3700950", "G5100450"
    ),
    county_name = c(
        "Loving County, TX", "Alpine County, CA", "Hinsdale County, CO",
        "Greene County, VA", "Wayne County, UT", "Charles City, VA",
        "Kane County, UT", "Rich County, UT", "Hyde County, NC",
        "Craig County, VA"
    ),
    city_name = ""
)

# One row per selected county: entropy by census year (year_1900 ... year_1940)
# plus the 1900 and 1940 counts, sorted from least to most diverse in 1940.
# Join keys are spelled out so the joins cannot silently pick up another
# shared column and do not emit "Joining with `by = ...`" messages.
tab2 <- d %>%
    filter(gisjoin_1900 %in% county_mapping$gisjoin_1900) %>%
    pivot_wider(
        id_cols = gisjoin_1900,
        names_from = year,
        values_from = entropy_namelast_mp_adjp,
        names_prefix = "year_"
    ) %>%
    inner_join(pop1900, by = "gisjoin_1900") %>%
    inner_join(pop1940, by = "gisjoin_1900") %>%
    select(gisjoin_1900, pop1900, pop1940, everything()) %>%
    arrange(year_1940)

# Swap the county code for human-readable labels and put them first.
tab2_with_county <- tab2 %>%
    left_join(county_mapping, by = "gisjoin_1900") %>%
    select(county_name, city_name, pop1900, pop1940, everything(), -gisjoin_1900)

# Render the counts as whole numbers with thousands separators (character).
tab2_with_county$pop1900 <- format(round(tab2_with_county$pop1900), big.mark = ",")
tab2_with_county$pop1940 <- format(round(tab2_with_county$pop1940), big.mark = ",")

# Write the LaTeX table, then hand the file to the project helper
# `get_clustering_coefs_rows()` (defined outside this script).
tab2_xtable <- xtable(tab2_with_county)
tablename <- file.path(poutputappendix, "tableB03low.tex")
print(tab2_xtable,
    include.rownames = FALSE,
    file = tablename
)
get_clustering_coefs_rows(tablename)


## Average diversity and population across counties
# Single-row table: the unweighted mean of every numeric column (entropy in
# each census year, plus the 1900 and 1940 counts) over all counties that
# appear in both `pop1900` and `pop1940`.
# No sorting is done before averaging because a mean does not depend on row
# order.
# NOTE(review): `mean` is called without `na.rm`, so a county missing in any
# intermediate census year makes that year's average NA -- confirm the panel
# is balanced.
tab3 <- d %>%
    pivot_wider(
        id_cols = gisjoin_1900,
        names_from = year,
        values_from = entropy_namelast_mp_adjp,
        names_prefix = "year_"
    ) %>%
    inner_join(pop1900, by = "gisjoin_1900") %>%
    inner_join(pop1940, by = "gisjoin_1900") %>%
    summarise(across(where(is.numeric), mean)) %>%
    select(pop1900, pop1940, everything())

# Render the average counts as whole numbers with thousands separators.
tab3$pop1900 <- format(round(tab3$pop1900), big.mark = ",")
tab3$pop1940 <- format(round(tab3$pop1940), big.mark = ",")

# Write the LaTeX table, then hand the file to the project helper
# `get_clustering_coefs_rows()` (defined outside this script).
tab3_xtable <- xtable(tab3)
tablename <- file.path(poutputappendix, "tableB03avg.tex")
print(tab3_xtable,
    include.rownames = FALSE,
    file = tablename
)
get_clustering_coefs_rows(tablename)
# Only the last of the three output files is reported here.
cat("Table B3 saved to:", tablename, "\n")